The purpose of exploratory analysis is to perform initial investigations on the data so as to discover patterns, identify any anomalies, to test initial hypothesis and check assumptions made at the start of the project.

In this section of the project we will ask two types of questions:

  1. what type of variation occurs within my variables?
  2. what type of covariation occurs between my variables?

Variation being the tendency for values of a variable to change from measurement to measurement, and covariance is the tendency for two or more variables to vary together in a related way.

pacman::p_load(
  tidyverse,
  here,
  RColorBrewer,
  lubridate,
  scales,
  GGally,
  stats,
  corrplot,
  mice,
  VIM
)
flights_dt <- read_csv(here("clean_data/flights_clean.csv"))
Rows: 327346 Columns: 38
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr  (11): origin, origin_name, dest, dest_name, carrier, carrier_name, tailnum, manufacturer, model, engine, type
dbl  (22): dep_delay, arr_delay, air_time, distance, flight, engines, seats, aircraft_age, lat, lon, alt, wind_di...
dttm  (5): dep_time, sched_dep_time, arr_time, sched_arr_time, time_hour

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

Which airport in our dataset has the highest number of departures?

flights_dt %>% 
  group_by(origin) %>%
  ggplot() +
  aes(x = origin, fill = origin) +
  geom_bar() +
  scale_fill_manual(values = cbPalette) +
  labs(
    x = "airport",
    y = "departure numbers",
    title = "Departure numbers by airport"
  ) +
  guides(fill = "none") +
  theme_bw()

What is the trend of delays over the year, how does it compare across the three airports?

flights_dt %>% 
  filter(dep_delay > 0) %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = month, y = mean_delay, colour = origin) +
  geom_line(alpha = 0.6) +
  geom_point(alpha = 0.8) +
  labs(
    x = "month",
    y = "mean delay (minutes)",
    title = " Mean delay by month",
    colour = "airport"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()

What is the number of departure delays over the year?

flights_dt %>%
  filter(dep_delay > 0) %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(no_of_delays = n(), .groups = "drop") %>% 
  ggplot() +
  aes(x = month, y = no_of_delays, colour = origin) +
  geom_line(alpha = 0.6) +
  geom_point(alpha = 0.8) +
  labs(
    x = "month",
    y = "number of delays",
    title = " Monthly trend of departure delays",
    colour = "airport"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()

What is the number of flights per month?

flights_dt %>%
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(no_of_flights = n(), .groups = "drop") %>% 
  ggplot() +
  aes(x = month, y = no_of_flights, fill = origin) +
  geom_col(position = "dodge", alpha = 0.8) +
  labs(
    x = "month",
    y = "number of departures",
    title = "Departure numbers by month",
    fill = "airport"
  ) +
  scale_fill_manual(values = cbPalette) +
  theme_bw()

Which carrier has the most departure delays?

flights_dt %>% 
  filter(dep_delay > 0) %>% 
  group_by(carrier_name, origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
  geom_col(alpha = 0.8) +
  facet_wrap(~ origin) +
  labs(
    x = "carrier",
    y = "mean delay (minutes)",
    title = "Mean departure delay by carrier",
    fill = "airport"
  ) +
  theme_bw() +
  scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
  coord_flip()

Which carrier has the most arrival delays?

flights_dt %>% 
  filter(arr_delay > 0) %>% 
  group_by(carrier_name, origin) %>% 
  summarise(mean_delay = mean(arr_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
  geom_col(alpha = 0.8) +
  facet_wrap(~ origin) +
  labs(
    x = "carrier",
    y = "mean delay (minutes)",
    title = "Mean arrival delay by carrier",
    fill = "airport"
  ) +
  theme_bw() +
  scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
  coord_flip()

Which season has the highest number of departure delays?

flights_dt %>% 
  filter(dep_delay > 0) %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>% 
  ggplot() +
  aes(x = day, y = mean_delay, colour = origin) +
  geom_point(alpha = 0.8) +
  labs(
    x = "date",
    y = "mean delay (minutes)",
    title = "Mean departure delay by month"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()

Over the course of a day when is the largest average delay?

flights_dt %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  select(dep_delay, hour) %>%
  group_by(hour) %>% 
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>% 
  ggplot() +
  aes(x = hour, y = mean_delay) +
  geom_point(alpha = 0.8) +
  geom_smooth(se = FALSE) +
  labs(
    x = "hour",
    y = "mean delay (minutes)",
    title = "Mean departure delay by departure time"
  ) +
  theme_bw() +
  scale_x_continuous(limits = c(5, 23), breaks = seq(5, 23, by = 1))
`geom_smooth()` using method = 'loess' and formula 'y ~ x'

Which destinations suffer from the longest delays?

flights_dt %>%
  drop_na(dest_name) %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(dest, dest_name) %>% 
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>% 
  arrange(desc(mean_delay)) %>% 
  head(10) %>% 
  ggplot() +
  aes(x = reorder(dest_name, mean_delay), y = mean_delay) +
  geom_col(fill = "#999999", alpha = 0.8) +
  labs(
    x = "destination",
    y = "mean delay (minutes)",
    title = "Destinations with longest delays"
  ) +
  theme_bw() +
  coord_flip()

Trend weather conditions against mean departure delays

flights_dt %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_visibility = mean(visib, na.rm = TRUE),
            mean_delay = mean(dep_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = mean_visibility, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean visibility (miles)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by visibility"
  ) +
  theme_bw()
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_wind_speed = mean(wind_speed, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_wind_speed, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean wind speed (mph)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by wind speed"
  ) +
  theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_wind_dir = mean(wind_dir, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_wind_dir, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean wind direction (degrees)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by wind direction"
  ) +
  theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_humidity = mean(humid, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_humidity, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean humidity (%)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by humidity"
  ) +
  theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_temp = mean(temp, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_temp, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean temperature (degf)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by temperature"
  ) +
  theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_dewpoint = mean(dewp, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_dewpoint, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean dewpoint (degF)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by dewpoint"
  ) +
  theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_precip = mean(precip, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_precip, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean precipitation (inches)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by precipitation"
  ) +
  theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_pressure = mean(pressure, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_pressure, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean pressure (mbar)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by pressure"
  ) +
  theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

Corrplot

# subset the flights dataset so as to drop NAs, filter on only departure delays (no early departures) and only for Newark Int.
flights_weather <- flights_dt %>% 
  na.omit() %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  select(dep_delay, temp, dewp, humid, wind_dir, wind_speed, precip, pressure, visib)

# create correlation matrix
cor_matrix <- cor(flights_weather)
corrplot(cor_matrix, type = "upper", col = cbPalette,
         tl.col = "black", method = "number")

ggpairs

ggpairs(flights_weather)

 plot: [1,1] [>--------------------------------------------------------------------------------------]  1% est: 0s 
 plot: [1,2] [=>-------------------------------------------------------------------------------------]  2% est: 1s 
 plot: [1,3] [==>------------------------------------------------------------------------------------]  4% est: 2s 
 plot: [1,4] [===>-----------------------------------------------------------------------------------]  5% est: 2s 
 plot: [1,5] [====>----------------------------------------------------------------------------------]  6% est: 2s 
 plot: [1,6] [=====>---------------------------------------------------------------------------------]  7% est: 2s 
 plot: [1,7] [=======>-------------------------------------------------------------------------------]  9% est: 2s 
 plot: [1,8] [========>------------------------------------------------------------------------------] 10% est: 2s 
 plot: [1,9] [=========>-----------------------------------------------------------------------------] 11% est: 2s 
 plot: [2,1] [==========>----------------------------------------------------------------------------] 12% est: 2s 
 plot: [2,2] [===========>---------------------------------------------------------------------------] 14% est: 2s 
 plot: [2,3] [============>--------------------------------------------------------------------------] 15% est: 2s 
 plot: [2,4] [=============>-------------------------------------------------------------------------] 16% est: 2s 
 plot: [2,5] [==============>------------------------------------------------------------------------] 17% est: 2s 
 plot: [2,6] [===============>-----------------------------------------------------------------------] 19% est: 2s 
 plot: [2,7] [================>----------------------------------------------------------------------] 20% est: 2s 
 plot: [2,8] [=================>---------------------------------------------------------------------] 21% est: 2s 
 plot: [2,9] [==================>--------------------------------------------------------------------] 22% est: 2s 
 plot: [3,1] [===================>-------------------------------------------------------------------] 23% est: 2s 
 plot: [3,2] [====================>------------------------------------------------------------------] 25% est: 2s 
 plot: [3,3] [======================>----------------------------------------------------------------] 26% est: 2s 
 plot: [3,4] [=======================>---------------------------------------------------------------] 27% est: 2s 
 plot: [3,5] [========================>--------------------------------------------------------------] 28% est: 2s 
 plot: [3,6] [=========================>-------------------------------------------------------------] 30% est: 2s 
 plot: [3,7] [==========================>------------------------------------------------------------] 31% est: 2s 
 plot: [3,8] [===========================>-----------------------------------------------------------] 32% est: 2s 
 plot: [3,9] [============================>----------------------------------------------------------] 33% est: 2s 
 plot: [4,1] [=============================>---------------------------------------------------------] 35% est: 2s 
 plot: [4,2] [==============================>--------------------------------------------------------] 36% est: 2s 
 plot: [4,3] [===============================>-------------------------------------------------------] 37% est: 2s 
 plot: [4,4] [================================>------------------------------------------------------] 38% est: 2s 
 plot: [4,5] [=================================>-----------------------------------------------------] 40% est: 2s 
 plot: [4,6] [==================================>----------------------------------------------------] 41% est: 2s 
 plot: [4,7] [====================================>--------------------------------------------------] 42% est: 2s 
 plot: [4,8] [=====================================>-------------------------------------------------] 43% est: 2s 
 plot: [4,9] [======================================>------------------------------------------------] 44% est: 2s 
 plot: [5,1] [=======================================>-----------------------------------------------] 46% est: 1s 
 plot: [5,2] [========================================>----------------------------------------------] 47% est: 1s 
 plot: [5,3] [=========================================>---------------------------------------------] 48% est: 1s 
 plot: [5,4] [==========================================>--------------------------------------------] 49% est: 1s 
 plot: [5,5] [===========================================>-------------------------------------------] 51% est: 1s 
 plot: [5,6] [============================================>------------------------------------------] 52% est: 1s 
 plot: [5,7] [=============================================>-----------------------------------------] 53% est: 1s 
 plot: [5,8] [==============================================>----------------------------------------] 54% est: 1s 
 plot: [5,9] [===============================================>---------------------------------------] 56% est: 1s 
 plot: [6,1] [================================================>--------------------------------------] 57% est: 1s 
 plot: [6,2] [=================================================>-------------------------------------] 58% est: 1s 
 plot: [6,3] [===================================================>-----------------------------------] 59% est: 1s 
 plot: [6,4] [====================================================>----------------------------------] 60% est: 1s 
 plot: [6,5] [=====================================================>---------------------------------] 62% est: 1s 
 plot: [6,6] [======================================================>--------------------------------] 63% est: 1s 
 plot: [6,7] [=======================================================>-------------------------------] 64% est: 1s 
 plot: [6,8] [========================================================>------------------------------] 65% est: 1s 
 plot: [6,9] [=========================================================>-----------------------------] 67% est: 1s 
 plot: [7,1] [==========================================================>----------------------------] 68% est: 1s 
 plot: [7,2] [===========================================================>---------------------------] 69% est: 1s 
 plot: [7,3] [============================================================>--------------------------] 70% est: 1s 
 plot: [7,4] [=============================================================>-------------------------] 72% est: 1s 
 plot: [7,5] [==============================================================>------------------------] 73% est: 1s 
 plot: [7,6] [===============================================================>-----------------------] 74% est: 1s 
 plot: [7,7] [=================================================================>---------------------] 75% est: 1s 
 plot: [7,8] [==================================================================>--------------------] 77% est: 1s 
 plot: [7,9] [===================================================================>-------------------] 78% est: 1s 
 plot: [8,1] [====================================================================>------------------] 79% est: 1s 
 plot: [8,2] [=====================================================================>-----------------] 80% est: 1s 
 plot: [8,3] [======================================================================>----------------] 81% est: 1s 
 plot: [8,4] [=======================================================================>---------------] 83% est: 0s 
 plot: [8,5] [========================================================================>--------------] 84% est: 0s 
 plot: [8,6] [=========================================================================>-------------] 85% est: 0s 
 plot: [8,7] [==========================================================================>------------] 86% est: 0s 
 plot: [8,8] [===========================================================================>-----------] 88% est: 0s 
 plot: [8,9] [============================================================================>----------] 89% est: 0s 
 plot: [9,1] [=============================================================================>---------] 90% est: 0s 
 plot: [9,2] [==============================================================================>--------] 91% est: 0s 
 plot: [9,3] [================================================================================>------] 93% est: 0s 
 plot: [9,4] [=================================================================================>-----] 94% est: 0s 
 plot: [9,5] [==================================================================================>----] 95% est: 0s 
 plot: [9,6] [===================================================================================>---] 96% est: 0s 
 plot: [9,7] [====================================================================================>--] 98% est: 0s 
 plot: [9,8] [=====================================================================================>-] 99% est: 0s 
 plot: [9,9] [=======================================================================================]100% est: 0s 
                                                                                                                   

Subset flights

flights_numerical_all <- flights_dt %>%
  filter(dep_delay > 0 & origin == "EWR") %>% 
  na.omit() %>% 
  select_if(., is.numeric) %>%  # select all numeric variables
  select(-arr_delay)
  
flights_numerical_all

Simple linear regression

# build a simple linear regression model using dep_delay and the weather variables
model <- lm(dep_delay ~ ., data = flights_weather)

# view results 
summary(model)

Call:
lm(formula = dep_delay ~ ., data = flights_weather)

Residuals:
   Min     1Q Median     3Q    Max 
-59.90 -29.00 -16.77  10.88 815.29 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 631.659739  40.355182  15.653  < 2e-16 ***
temp          0.463713   0.135562   3.421 0.000625 ***
dewp         -0.435947   0.145838  -2.989 0.002798 ** 
humid         0.467200   0.076075   6.141 8.26e-10 ***
wind_dir     -0.017681   0.002832  -6.243 4.33e-10 ***
wind_speed    0.486040   0.054098   8.985  < 2e-16 ***
precip       -8.181936  17.295980  -0.473 0.636177    
pressure     -0.623965   0.038058 -16.395  < 2e-16 ***
visib         0.297599   0.213728   1.392 0.163802    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 48.57 on 40553 degrees of freedom
Multiple R-squared:  0.025, Adjusted R-squared:  0.02481 
F-statistic:   130 on 8 and 40553 DF,  p-value: < 2.2e-16
model <- lm(dep_delay ~ ., data = flights_numerical_all)
summary(model)

Call:
lm(formula = dep_delay ~ ., data = flights_numerical_all)

Residuals:
   Min     1Q Median     3Q    Max 
-72.72 -27.16 -13.85   9.58 814.57 

Coefficients: (1 not defined because of singularities)
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)   4.509e+02  4.220e+01  10.683  < 2e-16 ***
air_time     -6.738e-02  1.961e-02  -3.437 0.000589 ***
distance     -1.711e-03  3.154e-03  -0.542 0.587581    
flight        1.037e-03  2.755e-04   3.764 0.000167 ***
engines      -1.625e+00  6.519e+00  -0.249 0.803125    
seats        -6.355e-02  7.564e-03  -8.402  < 2e-16 ***
aircraft_age  2.627e-01  4.894e-02   5.368 8.03e-08 ***
lat          -9.124e-02  7.672e-02  -1.189 0.234325    
lon          -4.013e-01  1.065e-01  -3.767 0.000165 ***
alt           4.754e-04  2.472e-04   1.923 0.054510 .  
wind_dir     -1.445e-02  2.764e-03  -5.227 1.73e-07 ***
humid         6.034e-01  7.430e-02   8.121 4.75e-16 ***
hour          1.754e+00  5.545e-02  31.635  < 2e-16 ***
minute       -1.034e-02  1.309e-02  -0.790 0.429545    
year                 NA         NA      NA       NA    
temp          5.248e-01  1.323e-01   3.967 7.29e-05 ***
dewp         -5.353e-01  1.423e-01  -3.762 0.000169 ***
wind_speed    4.901e-01  5.279e-02   9.283  < 2e-16 ***
precip       -1.904e+01  1.688e+01  -1.128 0.259325    
pressure     -4.965e-01  3.751e-02 -13.237  < 2e-16 ***
visib         4.131e-01  2.086e-01   1.980 0.047704 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 47.37 on 40542 degrees of freedom
Multiple R-squared:  0.07253,   Adjusted R-squared:  0.07209 
F-statistic: 166.9 on 19 and 40542 DF,  p-value: < 2.2e-16

glmulti

glmulti_fit <- glmulti(
  dep_delay ~ ., 
  data = flights_weather,
  level = 2, # 2 = include pairwise interactions, 1 = main effects only (main effect = no pairwise interactions)
  minsize = 0, # no min size of model
  maxsize = -1, # -1 = no max size of model
  marginality = TRUE, # marginality here means the same as 'strongly hierarchical' interactions, i.e. include pairwise interactions only if both predictors present in the model as main effects.
  method = "g", # the problem is too large for exhaustive search, so search using a genetic algorithm
  crit = bic, # criteria for model selection is BIC value (lower is better)
  plotty = FALSE, # don't plot models as function runs
  report = TRUE, # do produce reports as function runs
  confsetsize = 10, # return best 100 solutions
  fitfunction = lm # fit using the `lm` function
)
---
title: "Flights Exploratory Analysis"
output: html_notebook
---

The purpose of exploratory analysis is to perform initial investigations on the data so as to discover patterns, identify any anomalies, to test initial hypothesis and check assumptions made at the start of the project. 

In this section of the project we will ask two types of questions: 

1. what type of variation occurs within my variables? 
2. what type of covariation occurs between my variables? 

Variation being the tendency for values of a variable to change from measurement to measurement, and covariance is the tendency for two or more variables to vary together in a related way.

```{r, warning=FALSE,message=FALSE}
pacman::p_load(
  tidyverse,
  here,
  RColorBrewer,
  lubridate,
  scales,
  GGally,
  stats,
  corrplot,
  leaps,
  glmulti,
  broom
)
```

```{r, warning=FALSE,message=FALSE}
flights_dt <- read_csv(here("clean_data/flights_clean.csv"))

cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
```

# Which airport in our dataset has the highest number of departures?

```{r}
flights_dt %>% 
  group_by(origin) %>%
  ggplot() +
  aes(x = origin, fill = origin) +
  geom_bar() +
  scale_fill_manual(values = cbPalette) +
  labs(
    x = "airport",
    y = "departure numbers",
    title = "Departure numbers by airport"
  ) +
  guides(fill = "none") +
  theme_bw()
```

# What is the trend of delays over the year, how does it compare across the three airports?

```{r,warning=FALSE}
flights_dt %>% 
  filter(dep_delay > 0) %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = month, y = mean_delay, colour = origin) +
  geom_line(alpha = 0.6) +
  geom_point(alpha = 0.8) +
  labs(
    x = "month",
    y = "mean delay (minutes)",
    title = " Mean delay by month",
    colour = "airport"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()
```

# What is the number of departure delays over the year?

```{r}
flights_dt %>%
  filter(dep_delay > 0) %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(no_of_delays = n(), .groups = "drop") %>% 
  ggplot() +
  aes(x = month, y = no_of_delays, colour = origin) +
  geom_line(alpha = 0.6) +
  geom_point(alpha = 0.8) +
  labs(
    x = "month",
    y = "number of delays",
    title = " Monthly trend of departure delays",
    colour = "airport"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()
```

# What is the number of flights per month?

```{r}
flights_dt %>%
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(no_of_flights = n(), .groups = "drop") %>% 
  ggplot() +
  aes(x = month, y = no_of_flights, fill = origin) +
  geom_col(position = "dodge", alpha = 0.8) +
  labs(
    x = "month",
    y = "number of departures",
    title = "Departure numbers by month",
    fill = "airport"
  ) +
  scale_fill_manual(values = cbPalette) +
  theme_bw()
```

# Which carrier has the most departure delays?

```{r}
flights_dt %>% 
  filter(dep_delay > 0) %>% 
  group_by(carrier_name, origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
  geom_col(alpha = 0.8) +
  facet_wrap(~ origin) +
  labs(
    x = "carrier",
    y = "mean delay (minutes)",
    title = "Mean departure delay by carrier",
    fill = "airport"
  ) +
  theme_bw() +
  scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
  coord_flip()
```

# Which carrier has the most arrival delays?

```{r}
flights_dt %>% 
  filter(arr_delay > 0) %>% 
  group_by(carrier_name, origin) %>% 
  summarise(mean_delay = mean(arr_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
  geom_col(alpha = 0.8) +
  facet_wrap(~ origin) +
  labs(
    x = "carrier",
    y = "mean delay (minutes)",
    title = "Mean arrival delay by carrier",
    fill = "airport"
  ) +
  theme_bw() +
  scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
  coord_flip()
```

# Which season has the highest number of departure delays?

```{r}
flights_dt %>% 
  filter(dep_delay > 0) %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>% 
  ggplot() +
  aes(x = day, y = mean_delay, colour = origin) +
  geom_point(alpha = 0.8) +
  labs(
    x = "date",
    y = "mean delay (minutes)",
    title = "Mean departure delay by month"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()
```

# Over the course of a day when is the largest average delay?

```{r}
flights_dt %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  select(dep_delay, hour) %>%
  group_by(hour) %>% 
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>% 
  ggplot() +
  aes(x = hour, y = mean_delay) +
  geom_point(alpha = 0.8) +
  geom_smooth(se = FALSE) +
  labs(
    x = "hour",
    y = "mean delay (minutes)",
    title = "Mean departure delay by departure time"
  ) +
  theme_bw() +
  scale_x_continuous(limits = c(5, 23), breaks = seq(5, 23, by = 1))
```
# Which are the most popular destinations from Newark Int?

```{r}
flights_dt %>% 
  filter(origin == "EWR") %>% 
  group_by(dest, dest_name) %>% 
  summarise(count = n(), .groups = "drop") %>% 
  arrange(desc(count)) %>% 
  head(10) %>% 
  ggplot() +
  aes(x = reorder(dest_name, count), y = count) +
  geom_col(fill = "#999999", alpha = 0.8) +
  labs(
    x = "destination",
    y = "number of flights per year",
    title = "Destinations from Newark Int."
  ) +
  theme_bw() +
  coord_flip()
```

# Which destinations suffer from the longest delays?

```{r}
flights_dt %>%
  drop_na(dest_name) %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(dest, dest_name) %>% 
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>% 
  arrange(desc(mean_delay)) %>% 
  head(10) %>% 
  ggplot() +
  aes(x = reorder(dest_name, mean_delay), y = mean_delay) +
  geom_col(fill = "#999999", alpha = 0.8) +
  labs(
    x = "destination",
    y = "mean delay (minutes)",
    title = "Destinations with longest delays"
  ) +
  theme_bw() +
  coord_flip()
```

# Trend weather conditions against mean departure delays

```{r}
flights_dt %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_visibility = mean(visib, na.rm = TRUE),
            mean_delay = mean(dep_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = mean_visibility, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean visibility (miles)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by visibility"
  ) +
  theme_bw()
```

```{r}
flights_dt %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_wind_speed = mean(wind_speed, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_wind_speed, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean wind speed (mph)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by wind speed"
  ) +
  theme_bw()
```

```{r}
flights_dt %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_wind_dir = mean(wind_dir, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_wind_dir, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean wind direction (degrees)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by wind direction"
  ) +
  theme_bw()
```

```{r}
flights_dt %>%
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_humidity = mean(humid, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_humidity, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean humidity (%)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by humidity"
  ) +
  theme_bw()
```

```{r}
flights_dt %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_temp = mean(temp, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_temp, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean temperature (degf)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by temperature"
  ) +
  theme_bw()
```

```{r}
flights_dt %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_dewpoint = mean(dewp, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_dewpoint, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean dewpoint (degF)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by dewpoint"
  ) +
  theme_bw()
```

```{r}
flights_dt %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_precip = mean(precip, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_precip, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean precipitation (inches)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by precipitation"
  ) +
  theme_bw()
```

```{r}
flights_dt %>%
  filter(dep_delay > 0 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_pressure = mean(pressure, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_pressure, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "mean pressure (mbar)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by pressure"
  ) +
  theme_bw()
```

# Corrplot

```{r}
# subset the flights dataset so as to drop NAs, filter on only departure delays (no early departures) and only for Newark Int.
flights_weather <- flights_dt %>% 
  na.omit() %>% 
  filter(dep_delay > 0 & origin == "EWR") %>% 
  select(dep_delay, temp, dewp, humid, wind_dir, wind_speed, precip, pressure, visib)

# create correlation matrix
cor_matrix <- cor(flights_weather)
corrplot(cor_matrix, type = "upper", col = cbPalette,
         tl.col = "black", method = "number")
```

# ggpairs

```{r}
ggpairs(flights_weather)
```

# Subset flights

```{r}
flights_numerical_all <- flights_dt %>%
  filter(dep_delay > 0 & origin == "EWR") %>% 
  na.omit() %>% 
  select_if(., is.numeric) %>%  # select all numeric variables
  select(-arr_delay)
  
flights_numerical_all
```

# Simple linear regression

```{r}
# build a simple linear regression model using dep_delay and the weather variables
model <- lm(dep_delay ~ ., data = flights_weather)

# view results 
summary(model)
```

```{r}
model <- lm(dep_delay ~ ., data = flights_numerical_all)
summary(model)
```

# glmulti

```{r}
glmulti_fit <- glmulti(
  dep_delay ~ ., 
  data = flights_weather,
  level = 2, # 2 = include pairwise interactions, 1 = main effects only (main effect = no pairwise interactions)
  minsize = 0, # no min size of model
  maxsize = -1, # -1 = no max size of model
  marginality = TRUE, # marginality here means the same as 'strongly hierarchical' interactions, i.e. include pairwise interactions only if both predictors present in the model as main effects.
  method = "g", # the problem is too large for exhaustive search, so search using a genetic algorithm
  crit = bic, # criteria for model selection is BIC value (lower is better)
  plotty = FALSE, # don't plot models as function runs
  report = TRUE, # do produce reports as function runs
  confsetsize = 10, # return best 100 solutions
  fitfunction = lm # fit using the `lm` function
)
```

